library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(data.table)
## Warning: package 'data.table' was built under R version 3.2.5
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
library(tidyr)
library(knitr)
library(ggplot2)
library(ggrepel)
library(broom)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
# Load the combined restaurant-level table and the raw Yelp business data.
combinewithoutuser <- read_csv("combinewithoutuser_complete.csv")
business <- readRDS("yelp_academic_dataset_business.rds")

# Keep only businesses whose `categories` text matches at least one of the
# food/drink-related terms below (grepl() is case-sensitive here).
# Fix: the first alternative used to be misspelled "Restautrant", which can
# never match the word "Restaurant"; it now reads "Restaurant".
business_simple <- business %>%
  filter(grepl("Restaurant|Food|Breakfast|Bars|food|Burgers|Brunch|Sandwiches|Pubs|Chinese|Italian|American|Pizza|Coffee|Tea|Fast Food|Asian|Fusion|Lounges|Cafes|Irish|Gluten|Salad|Diners|Seafood|Bakeries|Desserts|Japanese|Ice Cream & Frozen Yogurt|Tapas/Small Plates|Mediterranean|Wine Bars|Vegetarian|Portuguese|German|Delis|Chicken Wings|Hot Dogs|Polish|Greek|Sushi Bars|Indian|Mexican|Bagels|Donuts|Tapas Bars|Cocktail Bars|Ethnic Food|Middle Eastern|Steakhouses|Cafeteria|Candy Stores|Korean|Chocolatiers & Shops|Cheese Shops|Vietnamese|Thai|Tea Rooms|Latin American|Creperies|French|Taiwanese|Buffets|Cajun/Creole|Soul Food|Juice Bars & Smoothies|Fondue|Ethiopian|Persian/Iranian|Popcorn Shops|Spanish|Cheesesteaks|Fish & Chips|British|Kosher|Armenian|Cupcakes|Vegan|Hawaiian|Cuban|Gastropubs|Russian|Pretzels|Fruits & Veggies|Gelato|Halal|Dim Sum|Filipino|Pasta Shops|Mongolian|Colombian|Cantonese|Street Vendors|Belgian|Cambodian|Hungarian|Szechuan|Bubble Tea|Laotian|African|Beer Bar|Himalayan/Nepalese|Moroccan|Falafel|Indonesian|Turkish|Afghan|Food Stands|Modern European|Irish Pub|Brazilian|Food Court|Malaysian|Coffeeshops|Hot Pot|Burmese|Macarons|Ramen|Empanadas|Bistros|Teppanyaki|Brasseries|Singaporean|Champagne Bars|Scandinavian|Canadian|Poutineries|Haitian|Arabian|Austrian|Czech|Slovakian|Bangladeshi|Egyptian|Dominican|Scottish|Patisserie/Cake Shop|Pub Food|Puerto Rican|Australian|Ukrainian|Sri Lankan|Beer Garden|International|Beer Gardens|Serbo Croatian|Kebab|Alsatian|Oriental|Shanghainese|Venezuelan|Bavarian|Iberian|Curry Sausage|Rhinelandian|Beer Hall|Eastern European|Wok|Trinidadian|Swiss Food|Pita", categories))

# Restrict to a fixed list of cities.
business_simple <- business_simple %>%
  filter(city %in% c(
    "Pittsburgh", "Charlotte", "Urbana", "Champaign", "Phoenix",
    "Scottsdale", "Tempe", "Mesa", "Chandler", "Gilbert", "Glendale",
    "Las Vegas", "Henderson", "Madison"
  ))

# Manually overwrite the state of one specific business record with "NV".
business_simple$state[business_simple$business_id == "g49oTp73Pk_WpOfQVtmcew"] <- "NV"
## Plotting the distribution of review_counts per stars
# Bar chart: number of rows at each `stars` value.
ggplot(combinewithoutuser, aes(x = stars)) +
  geom_bar()
# Mean and variance of `stars` side by side.
summarize(combinewithoutuser, mean = mean(stars), var = var(stars)) ## not Poisson
## Source: local data frame [1 x 2]
##
## mean var
## (dbl) (dbl)
## 1 3.54474 0.4090287
#library(gridExtra)
#grid.arrange(map_nv1,map_nv2, nrow=1)
#all_states <- map_data("state")
#ggplot() + geom_polygon( data=all_states, aes(x=long, y=lat, group = group),colour="white", fill="grey10" ) + geom_point(data=restaurant_loc, aes(longitude, latitude), col="red", cex=3) + geom_point(data=station_loc, aes(lon, lat), col="white", cex=2) + facet_wrap(~state)
The longitudes and latitudes are used to map all restaurant locations in our dataset with R GIS (Geographical Information System). The size of the dots reflects the number of restaurants in a specific area. The bigger the dot, the larger the number of restaurants in the area.
## Distribution of all restaurants in our dataset
# Number of filtered businesses per city. geom_bar() with stat = "count"
# tallies the rows for each x value itself, so no group_by() is needed first.
business_simple %>%
  ggplot(aes(city)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")
# Number of filtered businesses per state.
business_simple %>%
  ggplot(aes(state)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")
# Load the restaurant and station location tables.
restaurant_loc <- read_csv("restaurant_loc.csv")
station_loc <- read_csv("station_loc.csv")
# Attach the location columns to every row of the combined table.
# The join key is spelled out: "business_id" is the column the implicit
# natural join resolved to, and naming it keeps the join from silently
# changing if the two tables ever gain another column in common.
restaurant_loc <- combinewithoutuser %>%
  left_join(restaurant_loc, by = "business_id")
# Subset of rows with more than 100 reviews.
restaurant_loc2 <- restaurant_loc %>%
  filter(review_count > 100)
library(RgoogleMaps)
## Warning: package 'RgoogleMaps' was built under R version 3.2.5
# Map of every restaurant location in the joined table.
PlotOnStaticMap(
  lat = restaurant_loc$latitude, lon = restaurant_loc$longitude,
  zoom = 5, size = c(640, 640), TrueProj = TRUE,
  cex = 1.4, pch = 19, col = "red3", FUN = points, add = FALSE
)

# Per-state maps, drawn in the order NV, PA, NC, IL, AZ, WI. For each state
# two maps are produced: all restaurants in the state, then only those with
# more than 100 reviews (rows of restaurant_loc2).
# The six copy-pasted sections are folded into one loop, and `add = F` is
# written as `add = FALSE` (F is an ordinary variable that can be reassigned).
# `state`, `state2` and `map_nv1` keep their original names so that after the
# loop they are bound exactly as before (to the last state's data and map).
# Inside filter(), `state` refers to the data frame column.
for (state_code in c("NV", "PA", "NC", "IL", "AZ", "WI")) {
  state <- restaurant_loc %>% filter(state == state_code)
  map_nv1 <- PlotOnStaticMap(
    lat = state$latitude, lon = state$longitude,
    zoom = 5, size = c(640, 640), TrueProj = TRUE,
    cex = 0.7, pch = 19, col = "red3", FUN = points, add = FALSE
  )
  state2 <- restaurant_loc2 %>% filter(state == state_code)
  map_nv1 <- PlotOnStaticMap(
    lat = state2$latitude, lon = state2$longitude,
    zoom = 5, size = c(640, 640), TrueProj = TRUE,
    cex = 0.7, pch = 19, col = "red3", FUN = points, add = FALSE
  )
}
## Type of restaurant
# Number of rows per restaurant type, all states together.
combinewithoutuser %>%
  ggplot(aes(type)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")
# Same counts, one panel per state, with the count axis on a log10 scale.
# The per-state split comes from facet_wrap(~state); the group_by(state)
# that used to precede ggplot() had no effect on the chart and is removed.
combinewithoutuser %>%
  ggplot(aes(type)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9)) +
  facet_wrap(~state)
## City
# Number of rows per city.
ggplot(combinewithoutuser, aes(city)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")
## State
# Number of rows per state.
ggplot(combinewithoutuser, aes(state)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")
## Review_count
# Boxplots of review_count, one box per city and one per state.
# geom_boxplot() builds one box per discrete x value, so the group_by()
# calls that used to precede ggplot() had no effect and are removed.
combinewithoutuser %>%
  ggplot(aes(city, review_count, col = city)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by city")
combinewithoutuser %>%
  ggplot(aes(state, review_count, col = state)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by state")
## Review_count < 1000
# Same two plots restricted to rows with fewer than 1000 reviews.
combinewithoutuser %>%
  filter(review_count < 1000) %>%
  ggplot(aes(city, review_count, col = city)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by city")
combinewithoutuser %>%
  filter(review_count < 1000) %>%
  ggplot(aes(state, review_count, col = state)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by state")
## Stars
# Boxplots of stars, one box per city and one per state.
# Fix: both titles used to say "Review_count" although the y variable is
# `stars`; they now name the variable actually plotted.
# The group_by() calls before ggplot() had no effect on the boxplots and are
# removed.
combinewithoutuser %>%
  ggplot(aes(city, stars, col = city)) +
  geom_boxplot() +
  ggtitle("Stars, stratified by city")
combinewithoutuser %>%
  ggplot(aes(state, stars, col = state)) +
  geom_boxplot() +
  ggtitle("Stars, stratified by state")
## Stars vs review_count
# Boxplot of review_count for each stars value, y axis on a log10 scale.
# `stars` is numeric, so the boxes are split by the explicit `group = stars`
# aesthetic; the group_by(stars) that used to precede ggplot() was redundant
# and is removed.
combinewithoutuser %>%
  ggplot(aes(stars, review_count, col = stars, group = stars)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Review_count, stratified by stars") +
  ylab("Review_count (log10 scale)")
# Subset of the combined table used for the attribute summaries.
test <- select(
  combinewithoutuser,
  business_id, type, city, state, review_count, stars,
  takeout:goodforbreakfast, alcohol, price:noise, wifi
)
## Overall averages of attributes
# summary() yields a text table; its 4th row is taken as the per-column
# average (it is NA for the non-numeric columns).
r_sum <- summary(test)
r_sum_df <- as.data.frame(r_sum[4, ])
colnames(r_sum_df) <- "Average"
# Characters 9-16 of each entry are kept as the value text, next to the
# column name.
r_s <- as.data.frame(
  cbind(rownames(r_sum_df), substr(r_sum_df$Average, 9, 16))
)
names(r_s) <- c("Attribute", "Average")
# Print everything except the first four rows (business_id, type, city, state).
kable(r_s[-(1:4), ])
| | Attribute | Average |
|---|---|---|
| 5 | review_count | 113.3 |
| 6 | stars | 3.545 |
| 7 | takeout | 0.9421 |
| 8 | reservation | 0.3368 |
| 9 | outdoorseating | 0.4512 |
| 10 | waiterservice | 0.6105 |
| 11 | creditcards | 0.9818 |
| 12 | goodforkids | 0.8323 |
| 13 | goodforgroups | 0.9197 |
| 14 | goodfordessert | 0.03297 |
| 15 | goodforlatenight | 0.06838 |
| 16 | goodforlunch | 0.4446 |
| 17 | goodfordinner | 0.3071 |
| 18 | goodforbrunch | 0.09984 |
| 19 | goodforbreakfast | 0.1117 |
| 20 | alcohol | 0.5349 |
| 21 | price | 1.594 |
| 22 | parking | 0.9392 |
| 23 | noise | 0.9197 |
| 24 | wifi | 0.4032 |
## plotting of binary attributes
# Drop the first six rows of r_s (business_id, type, city, state,
# review_count, stars) ...
r_s_binary<-r_s[-1:-6,]
# ... then drop positions 15-18 of what is left (price, parking, noise, wifi),
# leaving the 14 rows takeout through alcohol.
r_s_binary<-r_s_binary[-15:-18,]
# Horizontal bar chart (coord_flip) with the Average text printed on each bar
# and a red reference line at y = 14.
# NOTE(review): Average was built with substr(), i.e. it is text rather than a
# number, so the value axis is presumably discrete and ylim/yintercept are
# positions on that axis, not magnitudes - confirm.
r_s_binary %>% ggplot( aes(Attribute, Average) ) + geom_bar( stat="identity", position = "stack", width=0.9, fill="darkseagreen") + coord_flip(xlim=c(0,15), ylim=c(0,20)) + geom_text(aes(label=Average), hjust=1, color="white", size=3.5) + theme_minimal() + geom_hline(aes(yintercept=14), color="red")
## plotting of categorical attributes
# Rows 21-24 of r_s: price, parking, noise, wifi.
r_s_cat<-r_s[21:24,]
# One reference value per row, drawn below as a zero-height error bar
# (ymin == ymax == hline).
# NOTE(review): the meaning of 4, 2, 3, 2 is not shown here - presumably each
# attribute's top level; confirm.
r_s_cat$hline <- c(4,2,3,2)
r_s_cat %>% ggplot(aes(Attribute, Average)) + geom_bar( stat="identity", position = "stack", width=0.9, fill="darkseagreen") + coord_flip(xlim=c(0,5), ylim=c(0,10)) + geom_errorbar(aes(y=hline, ymax=hline, ymin=hline), colour="#AA0000") + geom_text(aes(label=Average), hjust=1, color="white", size=3.5) + theme_minimal()
## Averages of attributes by state
# Same averages table as above, restricted to rows with state "PA".
test_pa <- filter(test, state == "PA")
# The 4th row of the summary() text table is taken as the per-column average.
r_sum <- summary(test_pa)
r_sum_df <- as.data.frame(r_sum[4, ])
colnames(r_sum_df) <- "Average"
# Characters 9-16 of each entry are kept as the value text.
r_s <- as.data.frame(
  cbind(rownames(r_sum_df), substr(r_sum_df$Average, 9, 16))
)
names(r_s) <- c("Attribute", "Average")
# NOTE(review): only the first three rows are dropped here, so the
# non-numeric `state` row (Average NA) stays in the printed table; the
# overall table drops four rows. Confirm whether this is intended.
kable(r_s[-(1:3), ])
| | Attribute | Average |
|---|---|---|
| 4 | state | NA |
| 5 | review_count | 71.90 |
| 6 | stars | 3.63 |
| 7 | takeout | 0.9282 |
| 8 | reservation | 0.3942 |
| 9 | outdoorseating | 0.3625 |
| 10 | waiterservice | 0.7032 |
| 11 | creditcards | 0.9161 |
| 12 | goodforkids | 0.7153 |
| 13 | goodforgroups | 0.8504 |
| 14 | goodfordessert | 0.0146 |
| 15 | goodforlatenight | 0.07178 |
| 16 | goodforlunch | 0.4173 |
| 17 | goodfordinner | 0.4282 |
| 18 | goodforbrunch | 0.07664 |
| 19 | goodforbreakfast | 0.07543 |
| 20 | alcohol | 0.4939 |
| 21 | price | 1.687 |
| 22 | parking | 1.464 |
| 23 | noise | 0.9294 |
| 24 | wifi | 0.3771 |
## plotting of binary attributes
# r_s holds the PA-only averages at this point (built from test_pa).
# Drop the first six rows (business_id, type, city, state, review_count,
# stars) ...
r_s_binary<-r_s[-1:-6,]
# ... then drop positions 15-18 of what is left (price, parking, noise, wifi),
# leaving the 14 rows takeout through alcohol.
r_s_binary<-r_s_binary[-15:-18,]
# Horizontal bar chart (coord_flip) with the Average text printed on each bar
# and a red reference line at y = 14.
# NOTE(review): Average was built with substr(), i.e. it is text rather than a
# number, so the value axis is presumably discrete and ylim/yintercept are
# positions on that axis, not magnitudes - confirm.
r_s_binary %>% ggplot( aes(Attribute, Average) ) + geom_bar( stat="identity", position = "stack", width=0.9, fill="darkseagreen") + coord_flip(xlim=c(0,15), ylim=c(0,20)) + geom_text(aes(label=Average), hjust=1, color="white", size=3.5) + theme_minimal() + geom_hline(aes(yintercept=14), color="red")
## plotting of categorical attributes
# Rows 21-24 of r_s: price, parking, noise, wifi.
r_s_cat<-r_s[21:24,]
# One reference value per row, drawn below as a zero-height error bar
# (ymin == ymax == hline).
# NOTE(review): the meaning of 4, 2, 3, 2 is not shown here - presumably each
# attribute's top level; confirm.
r_s_cat$hline <- c(4,2,3,2)
r_s_cat %>% ggplot(aes(Attribute, Average)) + geom_bar( stat="identity", position = "stack", width=0.9, fill="darkseagreen") + coord_flip(xlim=c(0,5), ylim=c(0,10)) + geom_errorbar(aes(y=hline, ymax=hline, ymin=hline), colour="#AA0000") + geom_text(aes(label=Average), hjust=1, color="white", size=3.5) + theme_minimal()
# Rebuild the attribute subset from the combined table.
test <- select(
  combinewithoutuser,
  business_id, type, city, state, review_count, stars,
  takeout:goodforbreakfast, alcohol, price:noise, wifi
)
# Per-state mean of every column from takeout through alcohol.
r_sum_bin <- test %>%
  group_by(state) %>%
  select(state, takeout:alcohol) %>%
  summarise_each(funs(mean))
# Long format: one row per (state, variable) pair.
mdat <- melt(r_sum_bin, id.vars = "state")
head(mdat)
## state variable value
## 1 AZ takeout 0.9674077
## 2 IL takeout 0.9636364
## 3 NC takeout 0.9597378
## 4 NV takeout 0.9095637
## 5 PA takeout 0.9282238
## 6 WI takeout 0.9081197
# Dodged bars: one bar per state for each variable, with a red line at 1.
mdat %>%
  ggplot(aes(variable, value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_hline(yintercept = 1, col = "red", cex = 0.5) +
  theme(axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9)) +
  ggtitle("Average attributes by state") +
  scale_fill_brewer() +
  theme(panel.background = element_rect(fill = "black")) +
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank()
  )
# Per-state mean of stars and of every column from price through wifi.
r_sum_cat <- test %>%
  group_by(state) %>%
  select(state, stars, price:wifi) %>%
  summarise_each(funs(mean))
# Long format: one row per (state, variable) pair.
mdat <- melt(r_sum_cat, id.vars = "state")
head(mdat)
## state variable value
## 1 AZ stars 3.551263
## 2 IL stars 3.454545
## 3 NC stars 3.541199
## 4 NV stars 3.516325
## 5 PA stars 3.630170
## 6 WI stars 3.580128
# Dodged bars: one bar per state for each variable.
mdat %>%
  ggplot(aes(variable, value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9)) +
  ggtitle("Average attributes by state") +
  scale_fill_brewer() +
  theme(panel.background = element_rect(fill = "black")) +
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank()
  )
# Histograms of stars (17 bins) and of price, parking, noise, wifi (10 bins).
ggplot(test, aes(stars)) +
  geom_histogram(stat = "bin", bins = 17, fill = "lightskyblue")
ggplot(test, aes(price)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")
ggplot(test, aes(parking)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")
ggplot(test, aes(noise)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")
ggplot(test, aes(wifi)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")
# One boxplot per variable, split and coloured by state.
# geom_boxplot() builds one box per discrete x value (state), so the
# group_by(state) that used to precede every ggplot() call had no effect on
# the plots and is removed.
combinewithoutuser %>%
  ggplot(aes(state, population_zip, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, income_zip, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, age_zip, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, white, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, Black.or.African.American, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, American.Indian.and.Alaska.Native, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, Asian, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, Native.Hawaiian.and.Other.Pacific.Islander, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, other, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, LANDSQMI, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, popdensity_zip, col = state)) +
  geom_boxplot()
combinewithoutuser %>%
  ggplot(aes(state, education_zip, col = state)) +
  geom_boxplot()